Team 1: Quadratic Done by Choi Jae Hyung, Lee Min Young, Nicholas Lui Ming Yang, Tran Duy Anh
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from scipy import stats
from functools import reduce
import sklearn.linear_model as lm
import sklearn
from sklearn.linear_model import LinearRegression as LR
from sklearn.svm import SVR
# Load the smart-meter readings.  df.shape is (1584823, 3) with columns:
#   localminute - timestamp of the reading
#   dataid      - meter ID
#   meter_value - cumulative meter reading
df = pd.read_csv("project_data.csv")
# dataid is unique per meter, so the number of distinct dataid values is the
# number of meters in the data set (notebook cell output).
df["dataid"].value_counts().size
grouped = df.groupby(['dataid'], sort=['localminute'])
def has_decreasing_values(df):
    """Return True if any value in the series is lower than the previous one.

    Used with GroupBy.agg on each meter's 'meter_value' series: a cumulative
    meter should never decrease, so True flags a suspect meter.  The running
    value starts at 0, so a negative first reading also counts as a decrease.

    FIXES: iterate values directly instead of the deprecated Series.iteritems()
    (removed in pandas 2.0), and return False explicitly instead of falling
    off the end and returning None.
    """
    current_value = 0
    for val in df:
        if val < current_value:
            return True
        current_value = val
    return False
# Meters whose cumulative readings ever decrease (should be impossible for a
# healthy cumulative meter).  agg yields a truthy flag per offending meter;
# where(...).dropna() keeps only the truthy entries, and .keys() gives the
# dataid index of those meters.
meters_with_decreasing = (grouped['meter_value']
                          .agg(has_decreasing_values)
                          .where(lambda x: x)
                          .dropna()
                          .keys())
print(len(meters_with_decreasing))
meters_with_decreasing  # notebook cell output: the offending meter ids
## Need to re-sort after filter since filter takes all rows and breaks original sorting
## FIX: groupby's `sort` flag only orders group keys, never rows within a
## group, so sort by timestamp explicitly before re-grouping.
dec_meters = (grouped.filter(lambda x: int(x.name) in meters_with_decreasing)
              .sort_values('localminute')
              .groupby('dataid'))
## Iterate over values to find offending rows for each meter.
## PERF: the original indexed with .iloc on *every* row (twice), which is why
## it ran very slowly.  Scan the raw numpy values instead and only pay the
## .iloc cost for the rare offending rows; the stored output is unchanged.
offending_values = {}
for group_id in dec_meters.groups:
    group_rows = dec_meters.get_group(group_id)
    values = group_rows['meter_value'].to_numpy()
    bad_pairs = []
    current_value = 0
    for i, value in enumerate(values):
        if value < current_value:
            # store the [previous_reading, offending_reading] row pair
            bad_pairs.append([group_rows.iloc[i - 1], group_rows.iloc[i]])
        current_value = value
    offending_values[group_id] = bad_pairs
print("Meter ID |", "Number of broken readings")
for k, v in offending_values.items():
print(str(k).ljust(20), len(v))
print("Meter ID |", "Number of broken readings |", "Average decrease across broken readings |", "Percentage of broken readings for meter")
for k, v in offending_values.items():
print(str(k).ljust(20),
str(len(v)).ljust(20),
str(reduce(lambda x, y: x + abs(y[1]['meter_value'] - y[0]['meter_value']) / len(v), [0] + v )).ljust(40),
(len(v) / dec_meters.get_group(k).shape[0]) * 100)
print("Meter ID |", "Number of broken readings |", "Average decrease across broken readings |", "Percentage of broken readings for meter")
for k, v in offending_values.items():
measure = str(reduce(lambda x, y: x + abs(y[1]['meter_value'] - y[0]['meter_value']) / len(v), [0] + v )).ljust(40)
if float(measure) > 10:
print(str(k).ljust(20),
str(len(v)).ljust(20),
measure,
(len(v) / dec_meters.get_group(k).shape[0]) * 100)
"""Use this function to validate/check each bad meter given our heuristic"""
def print_bad_meter_readings(meterID):
meter_readings = offending_values[meterID]
print("time apart |".ljust(20), "meter value initial |", "meter value after |", "difference")
for readings_pair in meter_readings:
print(str(pd.to_datetime(readings_pair[1]['localminute']) - pd.to_datetime(readings_pair[0]['localminute'])).ljust(25),
str(readings_pair[0]['meter_value']).ljust(20),
str(readings_pair[1]['meter_value']).ljust(20),
readings_pair[1]['meter_value'] - readings_pair[0]['meter_value'])
print_bad_meter_readings(1556)
grouped2 = df.groupby(['dataid'], sort=['localminute'])
def has_stagnant_values(df):
    """Return True if any consecutive pair of values rises by less than 2.

    Meters are only supposed to report after an increment of at least 2, so a
    follow-up value v with previous <= v < previous + 2 flags the meter as
    "stagnant".  The first value only seeds the comparison.

    FIXES: the original tested ``if index == 0`` on the Series *label* -- after
    a groupby the labels are the original DataFrame row labels, so the first
    element was almost never skipped and got compared against the bootstrap 0.
    Use the positional index instead.  Also return False explicitly and drop
    the deprecated Series.iteritems() (removed in pandas 2.0).
    """
    current_value = 0
    for pos, val in enumerate(df):
        if pos == 0:
            # seed with the first reading; it cannot be "stagnant" by itself
            current_value = val
            continue
        if current_value <= val < current_value + 2:
            return True
        current_value = val
    return False
# Flag meters with stagnant values.
# FIX: use grouped2 (created above specifically for this analysis) -- the
# original referenced `grouped` by mistake; both hold the same grouping, so
# the result is unchanged, but the intent is now explicit.
meters_with_stagnant = (grouped2['meter_value']
                        .agg(has_stagnant_values)
                        .where(lambda x: x)
                        .dropna()
                        .keys())
print("Number of meters with stagnant values: ", len(meters_with_stagnant))
# ## Need to re-sort after filter since filter takes all rows and breaks original sorting
# stagnant_meters = (grouped2.filter(lambda x: int(x.name) in meters_with_stagnant)
# .groupby(['dataid'], sort=['localminute']))
# ## Iterate over values to count offending occurences. Not counting value
# ## WARNING: RUNS VERY SLOWLY. TODO: OPTIMIZE
# offending_values2 = {}
# for group_id, rows in stagnant_meters.groups.items():
# offending_values2[group_id] = 0
# group_rows = stagnant_meters.get_group(group_id)
# group_rows_count = group_rows.shape[0]
# # Set current_value so we do not trigger first reading
# current_value = group_rows.iloc[0]['meter_value'] - 5
# for i in range(group_rows_count):
# if group_rows.iloc[i]['meter_value'] < current_value + 2 and group_rows.iloc[i]['meter_value'] >= current_value:
# offending_values2[group_id] += 1
# current_value = group_rows.iloc[i]['meter_value']
# print("Meter ID |", "Number of broken readings |", "Percentage of broken readings for meter")
# for k, v in offending_values2.items():
# print(str(k).ljust(20),
# str(v).ljust(20),
# (v / stagnant_meters.get_group(k).shape[0]) * 100)
# Keep only the stagnant meters and re-group, rows in time order.
# FIX: groupby's `sort` flag only orders group keys, not rows within a group,
# so sort by timestamp explicitly before re-grouping.
stagnant_meters = (grouped2.filter(lambda x: int(x.name) in meters_with_stagnant)
                   .sort_values('localminute')
                   .groupby('dataid'))
# sum() over group sizes replaces the reduce(..., [0] + list(...)) idiom
print("Total number of readings to iterate, which is why it is slow: ",
      sum(len(rows) for rows in stagnant_meters.groups.values()))
# let's use the meter with the most broken readings (based off decreasing values)
most_broken_values_meter = grouped.get_group(5403)  # all readings for dataid 5403
# We are using a stricter requirement now: meters are only supposed to report
# after an increment of at least 2, so a follow-up reading below the previous
# reading + 2 is considered "broken".
def broken_criteria(x, y):
    """Return True if the later reading *y* is broken relative to *x*.

    Comparator taking two (in-order) readings; *y* is broken when its
    'meter_value' rose by less than 2 (or decreased) since *x*.

    FIX: def with a docstring instead of a lambda assignment (PEP 8).
    """
    return y['meter_value'] < x['meter_value'] + 2
# Count readings of meter 5403 that violate the stricter criterion
num_readings = most_broken_values_meter.shape[0]
broken_readings = sum(
    1 for i in range(1, num_readings)
    if broken_criteria(most_broken_values_meter.iloc[i - 1],
                       most_broken_values_meter.iloc[i])
)
print("Number of broken readings for meter 5403, based on stricter requirements: ", broken_readings)
# Let's now create a function that can aggregate a meter's readings into "broken" time periods
# Return value of the function will be as follow:
"""
2D array.
First dimension is the time periods.
Second dimension is the consecutive broken readings that make up the broken time period
To find the actual time data, take the ['localminute'] attribute from the first and last reading per period
time_periods = [
[reading_1, reading_2, reading_3],
[reading_1, reading_2]
]
"""
def get_broken_time_periods(broken_criteria, meter_readings):
    """Group consecutive "broken" readings into time periods.

    Parameters
    ----------
    broken_criteria : callable(prev_reading, reading) -> bool
        Comparator deciding whether the later reading is broken.
    meter_readings : DataFrame
        One meter's readings, in time order.

    Returns
    -------
    list of lists -- each inner list holds the consecutive broken readings
    (rows) forming one broken period.  Take ['localminute'] from the first
    and last reading of a period to get its time span.

    PERF: fetches each row once per iteration; the original called the
    (expensive) .iloc accessor twice per row.
    """
    time_periods = []
    temp_period = []
    prev_row = None
    for pos in range(meter_readings.shape[0]):
        row = meter_readings.iloc[pos]
        if prev_row is not None and broken_criteria(prev_row, row):
            temp_period.append(row)
        elif temp_period:
            # run of broken readings ended -- close out the period
            time_periods.append(temp_period)
            temp_period = []
        prev_row = row
    if temp_period:
        # readings ended while still inside a broken period
        time_periods.append(temp_period)
    return time_periods
# Aggregate meter 5403's readings into consecutive "broken" periods
broken_time_periods = get_broken_time_periods(broken_criteria, most_broken_values_meter)
print("Number of broken time periods: ", len(broken_time_periods))
Obtain hourly data from a list of given data, and plot the obtained data
The lowest value in an hour should be the closest value to an exact hour (hh:00:00) and a good estimate of that hour's data.
Therefore, the first data point in an hour is used to represent the hourly data. (The assumption is that we are interested in approximating the consumption amount at the start of the hour)
As the data is cumulative gas consumption by household, when there is no data it can be assumed that there was insignificant gas consumption during that period.
Hence, the missing data will be given same value as most recent hourly data obtained.
This plot of hourly readings does not account for "bad" data readings (consecutive reading with a lower value, which should not happen for cumulative consumption data)
The likelihood of seeing an hourly data point that is bad is Average P(bad readings an hour) (460), since we have max 4 readings per minute
This reduces the code complexity to get a plot, and the usefulness of this approach depends on the usage of this plot
If we are simply looking for a general consumption level visualisation, then this plot should suffice. However, if we are looking for plots with a decrease in value to identify faulty meters, then this plot is insufficient. For that, we cannot hold our original assumption (that readings are cumulative and strictly increasing), and should take the MIN(hour_values) to represent that hour
#change to datetime format so date/hour components can be extracted
df["localminute"] = pd.to_datetime(df["localminute"])
#get a dataframe of data by hour: group by (date, hour, meter) and take the
#column-wise min; for a cumulative meter the min value in an hour
#approximates the reading at the start of that hour
df_byhour = pd.DataFrame(df.groupby([df["localminute"].dt.date.rename("Date"),
                                     df["localminute"].dt.hour.rename("Hour"),
                                     df["dataid"]]).min().reset_index())
# To preview df_byhour sorted by meter and time:
# df_byhour = df_byhour.sort_values(by=['dataid', 'Date', 'Hour'])
#add a column called "DateHour" in the dataframe (full timestamp of the hour):
df_byhour['DateHour'] = pd.to_datetime(df_byhour['Date'].apply(str)+' '+df_byhour['Hour'].apply(str)+':00:00')
# Build the hourly series for one sample meter over one month.
#Format: ID No.
ID = 739
#ranges from 06/10/2015 00:00:00 to 05/11/2015 23:00:00 -- the strict
#inequalities below exclude the boundary dates themselves
start_date = datetime.date(2015, 10, 5)
end_date = datetime.date(2015, 11, 6)
filteredDF = df_byhour[(df_byhour["Date"] > start_date) & (df_byhour["Date"] < end_date)]
#from 06/10/2015 -> 06/11/2015 there are 31 days -> 744 data points
prevHr = pd.Timestamp('2015-10-06 00:00:00')
endHr = pd.Timestamp('2015-11-05 23:00:00')
#generate a temporary dataframe through get_group function with each id
dfTemp = filteredDF.groupby(['dataid']).get_group(ID)
#get the list of hours and meter values
hourList = dfTemp['DateHour'].tolist()
valList = dfTemp['meter_value'].tolist()
#re-initialize the prev val as the first value of the val list
prevVal = valList[0]
#re-initialize the list of values as a list with 1 item
newList = [valList[0]]
#loop through each hour in the list, forward-filling so the series has one
#value per hour (missing hours repeat the most recent reading)
for index, hr in enumerate(hourList):
    # BUG FIX: the original compared (hr - prevHr).seconds, which is only the
    # seconds *component* (0..86399) of the Timedelta and ignores whole days,
    # so gaps of a day or more were never filled.  total_seconds() is the
    # full gap (the day-aware form is also what L306-style code uses).
    while (hr - prevHr).total_seconds() >= 3600:
        newList.append(prevVal)
        #increment the fill cursor one hour at a time
        prevHr += pd.Timedelta(seconds=3600)
    #update the prev value every time one index is passed
    prevVal = valList[index]
#some data does not reach until the end date, thats where this comes in:
#pad the tail of the series with the last known value
while prevHr < endHr:
    newList.append(prevVal)
    prevHr += pd.Timedelta(seconds=3600)
# Hourly x-axis for the month-long plot
x = pd.date_range(datetime.date(2015, 10, 6), datetime.date(2015, 11, 6), freq = 'H').tolist()
#get rid of the last hour which is 00:00:00 06/11/2015
x.pop()
y = newList
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
# small black dots, one per hour
ax.plot(x, y, 'ok', ms=1)
plt.xlabel ("Time")
plt.ylabel ("Meter value")
ax.set_title('Hourly Data')
plt.show()
#get a list of meter id
idList = df['dataid'].unique()
#there are 183 days => 4392 hours => 4392 readings in each list
#generate a dictionary: dataid -> fixed-length hourly series
hourlyDataDict = {}
#define start hour and end hour of the full data window
staHr = pd.Timestamp('2015-10-01 05:00:00')
endHr = pd.Timestamp('2016-04-01 04:00:00')
#loop through each id, building one forward-filled hourly series per meter
for id1 in idList:
    #re-initialize the prev Hour as start hour for this meter
    prevHr = staHr
    #generate a temporary dataframe through get_group function with each id
    dfTemp = df_byhour.groupby(['dataid']).get_group(id1)
    #get the list of hours and meter values
    hourList = dfTemp['DateHour'].tolist()
    valList = dfTemp['meter_value'].tolist()
    #re-initialize the prev val as the first value of the val list
    prevVal = valList[0]
    #re-initialize the list of values as a list with 1 item
    newList = [valList[0]]
    #loop through each hour in the list
    for index, hr in enumerate(hourList):
        # BUG FIX: (hr - prevHr).seconds is only the seconds *component* of
        # the Timedelta and ignores whole days, so gaps >= 1 day were never
        # forward-filled.  total_seconds() is the full gap.
        while (hr - prevHr).total_seconds() >= 3600:
            newList.append(prevVal)
            #increment every hour
            prevHr += pd.Timedelta(seconds=3600)
        #update the prev value every time one index is passed
        prevVal = valList[index]
    #some data does not reach until the end date, thats where this comes in
    while prevHr < endHr:
        newList.append(prevVal)
        #increment every hour
        prevHr += pd.Timedelta(seconds=3600)
    #create a new entry: key is the id, value is the hourly series
    hourlyDataDict[id1] = newList
# Pairwise Pearson correlation between every pair of meters' hourly series
# (including each meter with itself).
corDict = {}
for id1 in idList:
    corDict[id1] = {
        id2: np.corrcoef(hourlyDataDict[id1], hourlyDataDict[id2])[0, 1]
        for id2 in idList
    }
from collections import Counter

idList.sort()
# For every meter, the five most-correlated other meters with their coefficients
top5dict = {}
for meter_id in idList:
    # Counter.most_common ranks partners by coefficient; the top entry is
    # presumably the meter's correlation with itself (1.0) -- NOTE(review):
    # a tie at 1.0 with another meter could break this assumption.
    ranked = Counter(corDict[meter_id]).most_common(6)
    del ranked[0]
    top5dict[meter_id] = ranked
top5dict
On top of magnitude, the number of times the value decreases can be a second histogram to observe.
1) Print a summary to inform the company & consumer about the meter's status. "We recommend maintenance because __."
2) Expected amount of payment for the maintenance. "Expected price: $__. Please dial 1800-900-XXXX"
Predicting gas consumption in the future is necessary because of many reasons:
Just to list down a few: customers, government, environmental activists, gas company, maintenance company, gas production countries.
With a good forecasting model, there could be various actions that can be done: Sell the data / models to
This might also be able to predict
(Regarding Q2.2 and Q2.3 future reading estimation beyond the 6 months data, they are shown at the end of Q2.3)
# Hour index 0..4391 as a column vector: the regression feature matrix
hourArray = features_train = np.asarray(range(4392)).reshape(-1, 1)
# One fitted LinearRegression per meter: hour index -> meter reading
clfDictLR = {}
for key in hourlyDataDict:
    #features_train = the number of hours since 05:00:00 01/10/2015
    features_train = hourArray
    #labels_train = that meter's hourly values
    labels_train = np.asarray(hourlyDataDict[key])
    model = LR()
    model.fit(features_train, labels_train)
    clfDictLR[key] = model
hourArray = features_train = np.asarray(range(4392)).reshape(-1, 1)
staHr = pd.Timestamp('2015-10-01 05:00:00')
dtStar = datetime.datetime(2015, 10, 1) + datetime.timedelta(hours=5)
dtEnd = datetime.datetime(2016, 4, 1) + datetime.timedelta(hours=4)
# hourly timestamps spanning the training window
dateTimeList = pd.date_range(dtStar, dtEnd, freq='H').tolist()
# each timestamp as [hours since staHr] -- the 2-D shape sklearn expects
hourList = [[(ts - staHr).total_seconds() / 3600] for ts in dateTimeList]
def newPredLR(meter_id):
    """Predict the hourly series for *meter_id* using its fitted LR model."""
    return clfDictLR[meter_id].predict(hourList)
# One time-series plot per meter: actual hourly values (green) against the
# fitted linear-regression line (black).
for key in hourlyDataDict:
    fig, ax = plt.subplots(1, 1, figsize=(20, 6))
    plt.xlabel('hours')
    plt.ylabel('values')
    x = dateTimeList
    y = np.asarray(hourlyDataDict[key])
    ax.plot(x, y,color='g')
    ax.plot(x, newPredLR(key) ,color='k')
    plt.title("Linear Regression Time Series Plot, Meter ID = " + str(key))
    plt.show()
# Same comparison as above but with the actual values as a scatter (tiny
# green points) so dense regions are easier to see.
for key in hourlyDataDict:
    fig, ax = plt.subplots(1, 1, figsize=(20, 6))
    plt.xlabel('hours')
    plt.ylabel('values')
    x = dateTimeList
    y = np.asarray(hourlyDataDict[key])
    ax.scatter(x, y,color='g',s=0.05)
    ax.plot(x, newPredLR(key) ,color='k')
    plt.title("Linear Regression Scatter Plot, Meter ID = " + str(key))
    plt.show()
As shown in Q2.2, there are multiple meters that are malfunctioning. Among the malfunctioning meters, some show exceptionally large jumps in meter readings. This spike increase may cause the prediction to be inaccurate.
There is a similar pattern in household gas consumptions. Hence taking the average value of all the gas consumption of households every hour gives general hourly consumption thereby reducing the estimation error made by huge jump in values of some meters.
# Normalise each meter to consumption since the start of the window, then
# average across meters for each hour.
hourlyConsumptionDataDict = {}
for key in hourlyDataDict:
    # all values in hourly data dict are offset by the meter's first value
    first = hourlyDataDict[key][0]
    hourlyConsumptionDataDict[key] = [x - first for x in hourlyDataDict[key]]
# calculate the average across all meters for each hour
avgDataList = []
# FIX: derive the meter count instead of hard-coding 157, so the cell keeps
# working if the set of meters changes
num_meters = len(hourlyConsumptionDataDict)
for i in range(4392):
    total = 0
    for key in hourlyConsumptionDataDict:
        total += hourlyConsumptionDataDict[key][i]
    avgDataList.append(total / num_meters)
# Fit both regressors on the averaged consumption series.
hourArray = features_train = np.asarray(range(4392)).reshape(-1,1)
features_train = hourArray
labels_train = np.asarray(avgDataList)
clfLR = LR()
clfLR.fit(features_train, labels_train)
# linear-kernel SVR for comparison against plain least squares
clfSVR = SVR(kernel='linear')
clfSVR.fit(features_train, labels_train)
staHr = pd.Timestamp('2015-10-01 05:00:00')
dtStar = datetime.datetime(2015, 10, 1) + datetime.timedelta(hours=5)
dtEnd = datetime.datetime(2016, 4, 1) + datetime.timedelta(hours=4)
# hourly timestamps across the 6-month training window
dateTimeList = pd.date_range(dtStar, dtEnd, freq='H').tolist()
# feature rows: [hours since staHr] for each timestamp
hourList = [[(ts - staHr).total_seconds() / 3600] for ts in dateTimeList]
staHr = pd.Timestamp('2015-10-01 05:00:00')
dtStar_future = datetime.datetime(2015, 10, 1) + datetime.timedelta(hours=5)
dtEnd_future = datetime.datetime(2016, 10, 1) + datetime.timedelta(hours=4)
# one full year of hourly timestamps -- extends 6 months past the data
dateTimeList_future = pd.date_range(dtStar_future, dtEnd_future, freq='H').tolist()
hourList_future = [[(ts - staHr).total_seconds() / 3600] for ts in dateTimeList_future]
# Time-series plot: average consumption (green) vs the LR fit (red) and the
# SVR fit (blue) over the training window.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
plt.xlabel('hours')
plt.ylabel('values')
x = dateTimeList
y = np.asarray(avgDataList)
ax.plot(x, y,color='g')
ax.plot(x, clfLR.predict(hourList) ,color='r')
ax.plot(x, clfSVR.predict(hourList) ,color='b')
plt.title("Linear Regression and Support Vector Regression, with Times Series Plot")
plt.show()
# Same comparison with the actual values as a scatter of tiny green points.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
plt.xlabel('hours')
plt.ylabel('values')
x = dateTimeList
y = np.asarray(avgDataList)
ax.scatter(x, y,color='g',s=0.05)
ax.plot(x, clfLR.predict(hourList) ,color='r')
ax.plot(x, clfSVR.predict(hourList) ,color='b')
plt.title("Linear Regression and Support Vector Regression, with Scatter Plot")
plt.show()
By averaging the data, the consumption generally increases, looking more like a proper cumulative gas consumption data.
Linear Regression & Support Vector Regression looks almost the same!
Let's find out by predicting the future value, beyond the 6 months data given.
Time series plot is shown for future data prediction.
# Extrapolate both models 6 months beyond the data (red: LR, blue: SVR)
# against the observed average series (green).
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
plt.xlabel('hours')
plt.ylabel('values')
x = dateTimeList
y = np.asarray(avgDataList)
ax.plot(x, y,color='g')
ax.plot(dateTimeList_future, clfLR.predict(hourList_future) ,color='r')
ax.plot(dateTimeList_future, clfSVR.predict(hourList_future) ,color='b')
plt.title("Predicting Future Data with Linear Regression and Support Vector Regression, with Times Series Plot")
plt.show()
As per section 1.4:
Focus: Find out which meter needs maintenance
Draw histogram to see the number of faulty meters with various magnitude of decreasing value. (Prof. Mehul's recommendation to visualize)
On top of magnitude, the number of times the value decreases can be a second histogram to observe.
Set our own standard from the visualization and calculate number of meters in need of maintenance
Follow-up idea:
1) Print a summary to inform the company & consumer about the meter's status. "We recommend maintenance because __."
2) Expected amount of payment for the maintenance. "Expected price: $__. Please dial 1800-900-XXXX"
# offending_values maps meter id -> list of [prev_reading, reading] pairs
# where consumption decreased.  Flatten it into rows for a histogram of the
# decrease magnitudes, to see whether the broken meters fail systematically.
decreasing_histogram_values = [
    {
        "meter_id": pair[0].dataid,
        "decrease_magnitude": pair[0].meter_value - pair[1].meter_value,
    }
    for readings in offending_values.values()
    for pair in readings
]
decreasing_values_df = pd.DataFrame(decreasing_histogram_values)
decreasing_values_hist = decreasing_values_df.hist(bins=100, column=['decrease_magnitude'])
# Broken-reading count per meter (only meters with at least one decrease)
broken_reading_count_df = decreasing_values_df.groupby(['meter_id']).agg(lambda x: x.shape[0]).rename({'decrease_magnitude': 'broken_count'}, axis='columns')
# Add in meters with no broken readings
appending_values = []
offending_keys = offending_values.keys()
for key in grouped.groups.keys():
    if key not in offending_keys:
        appending_values.append({
            "meter_id": key,
            "broken_count": 0
        })
# FIX: DataFrame.append is deprecated (removed in pandas 2.0) -- use
# pd.concat.  Also guard the case where every meter has broken readings,
# since set_index on an empty frame would raise a KeyError.
if appending_values:
    broken_reading_count_all_df = pd.concat(
        [broken_reading_count_df,
         pd.DataFrame(appending_values).set_index('meter_id')])
else:
    broken_reading_count_all_df = broken_reading_count_df.copy()
broken_reading_count_all_hist = broken_reading_count_all_df.hist(bins=5)
broken_reading_count_hist = broken_reading_count_df.hist(bins=50)
# Histogram restricted to meters with more than 5 broken readings
broken_reading_count_filtered_hist = (broken_reading_count_df
                                      .where(broken_reading_count_df['broken_count'] > 5)
                                      .dropna()
                                      .hist(bins=50))
# Per-meter broken-reading frequency: broken readings as a percentage of the
# meter's total readings (0 for meters with no broken readings).
broken_freq_values = []
for meter_key in grouped.groups.keys():
    total_readings = grouped.get_group(meter_key).shape[0]
    if meter_key in offending_values:
        bad_count = len(offending_values[meter_key])
        bad_freq = bad_count / total_readings * 100
    else:
        bad_count = 0
        bad_freq = 0
    broken_freq_values.append({
        "meter_id": meter_key,
        "broken_readings": bad_count,
        "total_readings": total_readings,
        "broken_frequency": bad_freq
    })
broken_freq_df = pd.DataFrame(broken_freq_values).set_index('meter_id')
broken_freq_hist = broken_freq_df.hist(bins=10, column="broken_frequency")
broken_freq_filtered_hist = (broken_freq_df
                             .where(broken_freq_df['broken_frequency'] > 0)
                             .hist(bins=10, column="broken_frequency"))
# Cumulative sum of broken counts, meters ordered from least to most broken
broken_readings_cumsum = broken_reading_count_all_df.sort_values(['broken_count']).cumsum()
# change index because meter_id is no longer in-order, and then plot cumulative sum
broken_readings_cumsum.reset_index().plot(y="broken_count", title="cumulative sum of broken readings")
### Additional stats to support hypothesis
print ("70% of broken readings: ", 0.7 * broken_readings_cumsum.max().broken_count)
# NOTE(review): this counts the meters whose running cumulative total already
# exceeds 70% of all broken readings, i.e. the meters in the top tail --
# confirm that is the intended "number of meters required to fill 70%".
print ("Number of meters required to fill 70% of readings: ",
       broken_readings_cumsum
       .where(broken_readings_cumsum.broken_count
              > 0.7 * broken_readings_cumsum.max().broken_count).dropna().shape[0])
We have earlier explored this as 3 separate measures
1) Absolute number of broken readings
2) Frequency of broken readings
3) Magnitude (or severity) of broken readings
A scoring system can be implemented, to give each meter a "broken" score across a certain time window. This will then be used to prioritize resources
These factors are related because they can be aggregated to calculate the losses incurred from each meter owing to erroneous readings.
What is missing from this data set is the actual gas consumption. With that, we could be sure of the actual extent of misreporting incurred by erroneous readings.
# Plot raw consumption for the ten meters with the most broken readings
# (the cumsum frame is sorted ascending, so tail(10) holds the worst
# offenders -- e.g. 8156, 5403)
ten_most_broken_meter_ids = broken_readings_cumsum.tail(10).index.values
for meter_id in ten_most_broken_meter_ids:
    grouped.get_group(meter_id).plot(x="localminute", y="meter_value", title=str(meter_id)+" consumption")
Rather than marking these meters as broken and wasting time fixing them, we can instead seek to understand what caused the broken readings.
Instead of simply marking decreasing readings to determine whether meters are broken, we can additionally check if the decreasing readings are instead a result of spikes, and whether they normalize after some time